org 100h ; code is assembled for offset 0x100; assume cs<=0x2a18 [0xfd..ff]=0
         ; ([0xfd..0xff] are presumably the bytes fetched by "lds bp,[si-3]"
         ;  in the init code, given si=0x100 at entry -- TODO confirm)

; Phase speeds: each one is multiplied by the frame timer in cx
; (imul ..,cx,Tn). The *4 presumably matches the 4-byte table entries.
T1 equ 27*4
T2 equ 19*4
T3 equ -32*4
T4 equ -23*4

; Address of the dword that holds the rotation coefficient C
; (written with "fstp d(C)", read with "fmul d(C)").
; No bytes have been emitted yet, so $ is still the origin and C = 0x108.
; NOTE(review): that address lies inside the init code, which has
; presumably finished executing before C is first written -- confirm.
C equ $+8

; ds is moved: all  constant access is [ss:bp+?]
; (bp-based addressing defaults to the ss segment).
; The macros address ss:xx as long as bp+si = 0x100;
; "byte" asks the assembler for an 8-bit displacement.
%define w(xx) word[byte bp+si-0x100+xx]
%define d(xx) dword[byte bp+si-0x100+xx]

  ; Init: load the segment registers and switch the video mode.
  ; Register values quoted in the comments assume si=0x100 at entry
  ; and a successful mode set -- TODO confirm against the target DOS.
  push 0xa000   ;<-[bp+si] = scratch variable
                ; (presumably the bytes of this very instruction are
                ;  reused as data once it has executed)
; the 4 most significant bytes of qword[bp+si] is TAN
; NOTE(review): the original comment said [bp+di]; the code stores and
; reads TAN through qword[bp+si], and the word/dword scratch writes to
; [bp+si] only touch the low 4 bytes of that qword.
  lds bp,[si-3] ; bp=0, ds=0x6800  table: cos
                ; (bp <- word at [si-3], ds <- word at [si-1])
  pop es              ; es=0xa000  screen
  mov ax,0x4f02       ; value doubles as int 10h function and as fs segment
  mov fs,ax           ; fs=0x4f02  table: color_mul / sin (filled via fs:bx at COS_TAB)
  mov bx,0x10e        ; video mode number passed in bx
  int 10h    ; 320x200 with 65536 colors; assume it's ok (ax=0x004f)

  ; This add does double duty: it turns the assumed return value 0x004f
  ; into the backbuffer segment, and its encoding together with the
  ; last byte of the preceding "int 10h" forms the float TABLE_STEP.
  add ax,0x39c9 ; 10 05 c9 39, should be "db 0f c9 39"
TABLE_STEP equ $-4  ; 0.000383495197 = 2pi / 16384 ~ 1 / (256 * pi^2)
  mov gs,ax           ; gs=0x3a18  backbuffer
;  fninit

;Cos table with 16384 entries
;It's not sin() 'cos I don't want 0.0 in the table.
;
; Each pass reads the angle counter at word[bp+di], writes one float to
; ds:bx and one to fs:bx (bx = angle*4, truncated to 16 bits), then
; increments the counter. The loop ends when the counter wraps to 0.
COS_TAB:
  imul bx,[bp+di],4 ; bx=[ss:bp+di]=[ss:-2]=angle (0 on init)
  fild word[bp+di]     ;; angle (loaded as a signed 16-bit integer)
  fmul d(TABLE_STEP)   ;; angle * TABLE_STEP
  fsincos              ;; cos(angle/65536*2pi): adjust period to 2pi
                       ;; fsincos leaves cos in st0 and sin in st1
; COLMUL is not a separate datum: the word at this address is the last
; byte of fsincos followed by the first byte of the next instruction.
COLMUL equ $-1 ; -5
  inc word[bp+di] ; next angle; sets ZF for the jnz below
  fstp dword[bx]  ; store cos into the ds table, leaving sin in st0
  fidivr w(COLMUL)  ; color_mul / sin(...)
  fstp dword[fs:bx] ; store into the fs table
  ; the FPU instructions above leave the CPU flags alone, so this
  ; still tests the result of "inc word[bp+di]"
  jnz COS_TAB     ; bx=4

; Frame loop
; Runs once per frame: derives the two rotation coefficients from the
; timer in cx, stores them for the pixel routine, and advances the timer.
M: ; bp=0 cx=timer

; Precompute the rotation matrix.
; We can do scale*cos() and tan()
;  instead of scale*cos() and scale*sin()
;  because there's no 0.0 in the cos table.
  imul bx,cx,T2         ; bx = table offset for phase t2
  fld dword[bx]         ;; cos(t2)
  fdiv d(ZOOM)          ;; scale_wave = cos(t2)/zoom

  imul bx,cx,T1         ; bx = table offset for phase t1
  fmul dword[bx]        ;; cos(t1) * scale_wave
  fsubr dword[bx]       ;; C=cos(t1) * (1 - scale_wave)
  fstp d(C)             ; save C in the dword at address C
  fld dword[bx-0x4000]  ;; sin(t1)
                        ;; (0x4000 bytes = 4096 entries back in the cos table)
  fdiv dword[bx]        ;; TAN=sin(t1)/cos(t1)
  fstp qword[bp+si]     ; store full 8 bytes
  inc cx                ; advance the frame timer
; ZOOM is not a separate datum: it aliases the 4 code bytes that end
; with the "inc cx" above, read back as a float through d(ZOOM).
ZOOM equ $-4  ; =9.679

; Pixel loop
; di is the pixel index. Each pass derives coordinates from di, saves
; all registers with pusha (the pixel routine IT reads the coordinates
; back out of that saved frame), then either computes the pixel with IT
; or copies it from the backbuffer in gs, and writes it to the screen.
X mov ax,0xcccd ; convert width 320 -> 65536
  mul di        ; dx:ax = di * 0xcccd
  add dx,0x9c80 ; center
  xchg ax,bx    ; full 16-bit precision of X
  pusha ; [-18-16-14-12-10 -8 -6 -4] on the stack
        ;   di si bp sp bx dx cx ax
        ;                  yy
        ;                x x

  mov ax,0x4f05  ; twice per frame: set window, assume 64kB granularity
  cwd            ; dx = 0 (ax is positive)

  add di,di      ; pixel index -> byte offset; CF = bit shifted out
  jnz NZ_DI      ; only switch the window when the byte offset is 0
  adc dx,dx      ; dx = CF from the add above
  xor bx,bx      ; bh=0 bl=window=0 dx=page(0 or 1)
                 ; (xor also clears CF)
  int 10h
NZ_DI:

  mov ax,[gs:di] ; candidate pixel from the backbuffer
  ; CF is still the carry of "add di,di" when the window switch was
  ; skipped. NOTE(review): on the window-switch path CF was cleared by
  ; the xor, presumably unchanged by int 10h -- confirm.
  jc COPY_MIRROR

  push di           ; IT overwrites di
  mov dl,10         ; dx = number of iterations
  call IT           ; returns the 16-bit color in ax
  pop di

  ; IT presumably returns with bp=0, so bp becomes -di here and the
  ; store lands at the mirrored offset (address arithmetic wraps at 16 bits).
  sub bp,di
  mov [gs:bp+320*200*2-2],ax
COPY_MIRROR:
  stosw          ; write the pixel to es:di (screen)

;  stosw   ; 2x faster: draw two pixels
  popa           ; restore everything, including di = pixel index
;  inc di
  inc di         ; next pixel
  jnz X  ; di=0

;  call SCREENSHOT

  in al,60h ; ESC check
  cmp al,1
  jne M     ; exit later
            ; (on ESC, execution falls through into IT below and
            ;  leaves through its final ret)

; IT: compute the color of one pixel.
; In:    dx = number of iterations, cx = timer, bp = 0,
;        pixel coordinates in the caller's pusha frame (read via [bp-9]),
;        C at d(C) and TAN at qword[bp+si] as prepared in the frame loop
; Out:   ax = 16-bit color
; Clobb: bx, cx, dx, di, si (incremented 3 times), flags, FPU stack;
;        bp is back at 0 on return
; Most loops here have no counter of their own: they are repeated by
; testing the parity flag after an inc/dec (jpo/jpe). FPU instructions
; do not touch the CPU flags, so the flags survive across them.
IT:
  ; [-18-16-14-12-10 -8 -6 -4]
  ;   di si bp sp bx dx cx ax
  ;                  yy
  ;                x x

Z fldz
  inc bp
  jpo Z   ; loop 3x  ;; R=0 G=0 B=0

  dec bp            ; bp = 2, so the loop below sees bp = 1, then 0
XY:
  dec bp            ; bp:1,0 -> [-8],[-9] -> y,x

;  fldl2e
;  fimul word[bp-9] ;; x[-47274..47274] y R G B
  fild word[bp-9]   ; load a coordinate from the saved register frame
  fadd st0          ;; x[-65536..65536] y R G B

  jpo XY  ; loop 2x, bp=0 again, zero flag = 1

  ; First use of LEN: ZF=1 (from the last "dec bp") makes the "jz" at
  ; the end of LEN come back to LEN_RET.
  jmp LEN
LEN_RET:
  imul di,[bp+si],4 ; di = d = 65536/2pi * length(x,y)/2

  imul ax,cx,T4
  sub ax,di         ; ax = t4-d

  imul cx,T3
  add di,cx         ; di = d-t3

  mov cx,0x8025  ; shift length, ~0x8000 xor constant, RGB phase shift (0x25)

; rotate and scale
; [x] = [C -S] * [x]
; [y]   [S  C]   [y]
; S is obtained as C*TAN: each pass multiplies by C, keeps a copy,
; then multiplies by TAN.
R fld st1          ;; y x y R G B    | x Sy x Cy R G B
  fmul d(C)        ;; Cy x y R G B   | Cx Sy x Cy R G B
  fst st2          ;; Cy x Cy R G B  | Cx Sy Cx Cy R G B
  fmul qword[bp+si]; multiply by TAN
  dec cx           ;; Sy x Cy R G B  | Sx Sy Cx Cy R G B
  jpe R ; loop 2x: 0x25 o, 0x24 e, 0x23 o
  faddp st3,st0  ;; Sy Cx Sx+Cy R G B
  fsubp st1,st0  ;; x=Cx-Sy y=Sx+Cy R G B

; square fold, rotate and scale
; Each coordinate is rounded to an integer in the scratch dword, offset
; by cx in its low word, and reloaded as a signed 16-bit value.
F fistp dword[bp+si] ; wrap: keep only bottom 16 bits
  add word[bp+si],cx ;~0x8000
  fild word[bp+si] ;; x = x-round(x) | y = y-round(y)
  fxch st1         ; swap so the other coordinate is handled next
  inc cx           ; cx counts back up to 0x8025
  jpe F ; loop 2x zero flag = 0

; interfering concentric circles

; subroutine: compute length of 2D vector, scale to access cos table
; Entered two ways: by "jmp LEN" with ZF=1 (leaves through LEN_RET), and
; by falling out of the F loop with ZF=0 (continues below the jz).
LEN: ;; x y -> [bp+si] = sqrt(x*x+y*y)/65536/2 * 16384/2pi = sqrt(x*x+y*y)*C = sqrt(C^2*(x*x+y*y))
  fld st1
  fmul st0            ;; y*y
  fld st1
  fmul st0            ;; x*x
  faddp               ;; x*x+y*y
  fmul d(TABLE_STEP)  ; exact: (16384/2pi/65536/2)^2 = 0.000395785+
  fsqrt
  fistp word[bp+si]   ; result goes to the scratch word
  jz LEN_RET          ; ZF is the caller's: none of the above changes it

; k = color_mul / cos(5*length(x,y) + d - t3)
; [R G B] += k * ( 1 - 2 * cos(3*(i/40 + t4-d) + [1.8 0.9 0]) );
  dec ah            ; ax -= 0x100; presumably the per-iteration i/40 step -- confirm
  imul bx,ax,3      ; bx = q = 65536/2pi * 3*(i/40 + t4-d)
Q fld1
  fsub dword[bx]
  fsub dword[bx]    ;; 1-2cos(q) x y R G B
  add bh,cl;37      ; q += ~ 0.9 * 256/2pi
  inc bp
  jpo Q  ; loop 3x ;; [dR dG dB]=1-2cos(q+[1.8 0.9 0])) x y R G B  ; bp=3

  ; bp is 3 here, so [bp+si-3] is the same scratch word that LEN wrote
  imul bx,[bp+si-3],10*4 ; 65536/2pi * 5*length(x,y)
K fmul dword[fs:bx+di] ; k = color_mul / cos(5*length(x,y) + d - t3)
  faddp st5,st0        ;; dG gB x y R+=k*dR G B
  dec bp
  jpo K ; loop 3x     ;; x y R+=k*dR G+=k*dG B+=k*dB

  dec dx  ; one iteration done
  jnz R   ; dx=0

  fcompp            ;; R G B
                    ; (used only to pop x and y; the compare result is ignored)

; Assemble RGB into 16-bit color.
; cl & 0x1f = shift length: cycle 5,6,5
; si is advanced on every pass, so the scratch word [bp+si] moves one
; byte up each time.
COL:
  fmul st0          ;; R^2 G^2 B^2
  fistp word[bp+si] ; if it's > 0x7fff, store 0x8000
  imul bx,[bp+si],2 ; double, set carry if it was > 0x3fff
  sbb bx,bp         ; overflow -> 0xffff
                    ; (bp is 0, so this only subtracts the carry)
  shld ax,bx,cl ; rrrrrggggggbbbbb
  xor cl,5^6    ; toggle the shift length between 5 and 6
  inc si
  jpo COL ; loop 3x

  ret

;%include "screenshot320_16.inc"
